knitr::opts_chunk$set(echo = TRUE)
suppressWarnings(suppressMessages(library(data.table)))
suppressWarnings(suppressMessages(library(readr)))
suppressWarnings(suppressMessages(library(plotly)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(maps)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(wordcloud)))
# Location of the mass-shootings CSV. Set the MS_DATASET_PATH environment
# variable to point at a different copy; when it is not set, the path used by
# the original analysis is kept so existing runs behave as before.
ms_dataset_path <- Sys.getenv(
  "MS_DATASET_PATH",
  unset = "C:/Users/riyac/Documents/Mass_Shootings_Dataset_Ver_5.csv"
)
# Read the file, parsing the Date column as month/day/year; every other
# column type is left to readr's guessing.
MS_dataset <- read_csv(
  ms_dataset_path,
  col_types = cols(Date = col_date(format = "%m/%d/%Y"))
)
# Preview the first five rows.
head(MS_dataset, 5)
## # A tibble: 5 x 21
## `S#` Title Location
## <int> <chr> <chr>
## 1 1 Texas church mass shooting Sutherland Springs, TX
## 2 2 Walmart shooting in suburban Denver Thornton, CO
## 3 3 Edgewood businees park shooting Edgewood, MD
## 4 4 Las Vegas Strip mass shooting Las Vegas, NV
## 5 5 San Francisco UPS shooting San Francisco, CA
## # ... with 18 more variables: Date <date>, `Incident Area` <chr>,
## # `Open/Close Location` <chr>, Target <chr>, Cause <chr>, Summary <chr>,
## # Fatalities <int>, Injured <int>, `Total victims` <int>, `Policeman
## # Killed` <int>, Age <dbl>, `Employeed (Y/N)` <int>, `Employed
## # at` <chr>, `Mental Health Issues` <chr>, Race <chr>, Gender <chr>,
## # Latitude <dbl>, Longitude <dbl>
# Preview the last five rows of the dataset.
tail(MS_dataset, n = 5L)
## # A tibble: 5 x 21
## `S#` Title Location Date
## <int> <chr> <chr> <date>
## 1 319 Clara Barton Elementary School Chicago, Illinois 1974-01-17
## 2 320 New Orleans Police Shootings New Orleans, Louisiana 1972-12-31
## 3 321 St. Aloysius Church Spokane, Washington 1971-11-11
## 4 322 Rose-Mar College of Beauty Mesa, Arizona 1966-11-12
## 5 323 University of Texas at Austin Austin, Texas 1966-08-01
## # ... with 17 more variables: `Incident Area` <chr>, `Open/Close
## # Location` <chr>, Target <chr>, Cause <chr>, Summary <chr>,
## # Fatalities <int>, Injured <int>, `Total victims` <int>, `Policeman
## # Killed` <int>, Age <dbl>, `Employeed (Y/N)` <int>, `Employed
## # at` <chr>, `Mental Health Issues` <chr>, Race <chr>, Gender <chr>,
## # Latitude <dbl>, Longitude <dbl>
# Rebuild the dataset as a data.table so the `[i, j, by]` / `:=` syntax used
# in the rest of the script is available.
MS_dataset <- data.table::data.table(MS_dataset)
# Print per-column summary statistics.
summary(object = MS_dataset)
## S# Title Location
## Min. : 1.0 Length:323 Length:323
## 1st Qu.: 81.5 Class :character Class :character
## Median :162.0 Mode :character Mode :character
## Mean :162.0
## 3rd Qu.:242.5
## Max. :323.0
##
## Date Incident Area Open/Close Location
## Min. :1966-08-01 Length:323 Length:323
## 1st Qu.:2001-01-15 Class :character Class :character
## Median :2013-11-01 Mode :character Mode :character
## Mean :2007-11-16
## 3rd Qu.:2015-12-02
## Max. :2017-11-05
##
## Target Cause Summary Fatalities
## Length:323 Length:323 Length:323 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 1.000
## Mode :character Mode :character Mode :character Median : 3.000
## Mean : 4.437
## 3rd Qu.: 5.500
## Max. :59.000
##
## Injured Total victims Policeman Killed Age
## Min. : 0.000 Min. : 3.00 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 4.00 1st Qu.:0.0000 1st Qu.: 20.50
## Median : 3.000 Median : 5.00 Median :0.0000 Median : 34.00
## Mean : 6.176 Mean : 10.26 Mean :0.1293 Mean : 78.15
## 3rd Qu.: 5.000 3rd Qu.: 9.00 3rd Qu.:0.0000 3rd Qu.: 42.00
## Max. :527.000 Max. :585.00 Max. :5.0000 Max. :1932.00
## NA's :6 NA's :144
## Employeed (Y/N) Employed at Mental Health Issues
## Min. :0.0000 Length:323 Length:323
## 1st Qu.:0.0000 Class :character Class :character
## Median :1.0000 Mode :character Mode :character
## Mean :0.6269
## 3rd Qu.:1.0000
## Max. :1.0000
## NA's :256
## Race Gender Latitude Longitude
## Length:323 Length:323 Min. :21.33 Min. :-161.79
## Class :character Class :character 1st Qu.:33.57 1st Qu.:-110.21
## Mode :character Mode :character Median :36.44 Median : -88.12
## Mean :37.23 Mean : -94.43
## 3rd Qu.:41.48 3rd Qu.: -81.70
## Max. :60.79 Max. : -69.71
## NA's :20 NA's :20
# Derive helper columns from the incident date in one pass: Month and Year as
# factors, Year_n as the numeric year.
MS_dataset[, `:=`(
  Month = as.factor(month(Date)),
  Year_n = as.numeric(year(Date)),
  Year = as.factor(year(Date))
)]
# Harmonise the Gender labels: expand the abbreviated values, label missing
# values as "Unknown", then store the column as a factor.
MS_dataset[Gender %in% "M", Gender := "Male"]
MS_dataset[Gender %in% "M/F", Gender := "Male/Female"]
MS_dataset[is.na(Gender), Gender := "Unknown"]
MS_dataset[, Gender := as.factor(Gender)]
# Bar chart of `Total victims` per incident, placed on the x axis by Year.
# Hovering a bar shows that incident's fatalities and injured counts.
plot_ly(
  data = MS_dataset,
  type = "bar",
  x = ~Year,
  y = ~`Total victims`,
  # I() marks the colour as a literal. A bare string is treated as a data
  # value and mapped through the default palette, so the bars would not
  # actually be drawn in red.
  color = I("red"),
  alpha = 0.9,
  hoverinfo = "text",
  # Hover text needs <br> for a line break; a "\n" is not rendered as one.
  text = ~paste(
    "Fatalities : ", Fatalities,
    "<br> Injured : ", Injured
  )
) %>%
  layout(
    title = "Number of Total victims by years",
    xaxis = list(title = ""),
    yaxis = list(title = "Number of victims")
  )
# Histogram counting the number of incidents (rows) per Year.
# Note: `mode` is a scatter-trace attribute and is not valid for histogram
# traces, so it is intentionally not passed here.
plot_ly(
  data = MS_dataset,
  type = "histogram",
  x = ~Year,
  alpha = 0.9
) %>%
  layout(
    title = "Number of incidents by years",
    xaxis = list(title = ""),
    yaxis = list(title = "Number of incidents")
  )
# Histogram counting the number of incidents (rows) per calendar Month.
# Note: `mode` is a scatter-trace attribute and is not valid for histogram
# traces, so it is intentionally not passed here.
plot_ly(
  data = MS_dataset,
  type = "histogram",
  x = ~Month,
  alpha = 0.9
) %>%
  layout(
    title = "Number of incidents by month",
    xaxis = list(title = "Month"),
    yaxis = list(title = "Number of incidents")
  )
# Fold the lower-case "unknown" label into "Unknown" so both spellings end up
# in the same pie slice.
MS_dataset[
  `Mental Health Issues` %in% "unknown",
  `Mental Health Issues` := "Unknown"
]
# Slice colours shared by the pie charts.
colors_pie1 <- c(
  "rgb(211,94,96)",
  "rgb(128,133,133)",
  "rgb(144,103,167)",
  "rgb(171,104,87)",
  "rgb(114,147,203)"
)
# Pie chart: total victims split by the shooter's mental-health label.
plot_ly(
  data = MS_dataset[, .(`Total victims`, `Mental Health Issues`)],
  type = "pie",
  labels = ~`Mental Health Issues`,
  values = ~`Total victims`,
  textposition = "inside",
  insidetextfont = list(color = "#FFFFFF"),
  marker = list(
    colors = colors_pie1,
    line = list(color = "#FFFFFF", width = 1)
  )
) %>%
  layout(
    title = "Mental Health Issues",
    showlegend = TRUE
  )
# Derive the State column from Location, which is written as "City, State".
# The part after the first comma is taken and then cleaned up:
#   * surrounding whitespace is stripped (the raw split yields " TX");
#   * two-letter US postal abbreviations are expanded to the full state name,
#     because the data mixes both spellings (e.g. "TX" and "Texas") and they
#     would otherwise be counted as different states.
# Locations without a comma (or missing locations) give NA.
MS_dataset[, State := {
  location_parts <- strsplit(Location, split = ",", fixed = TRUE)
  # vapply guarantees a character vector even for empty or ragged input.
  state_raw <- vapply(
    location_parts,
    function(parts) trimws(parts[2]),
    character(1)
  )
  # state.abb / state.name are the parallel built-in vectors of US states.
  state_full <- state.name[match(state_raw, state.abb)]
  # Keep the original text wherever it was not a known abbreviation.
  not_abbreviation <- is.na(state_full)
  state_full[not_abbreviation] <- state_raw[not_abbreviation]
  state_full
}]
# Pie chart: number of distinct incidents (unique `S#`) per State, ignoring
# rows for which no state could be derived.
plot_ly(
  data = MS_dataset[
    !is.na(State),
    .(`Number of incidents` = uniqueN(`S#`)),
    by = State
  ],
  type = "pie",
  labels = ~State,
  values = ~`Number of incidents`,
  textposition = "inside",
  insidetextfont = list(color = "#FFFFFF"),
  marker = list(
    colors = colors_pie1,
    line = list(color = "#FFFFFF", width = 1)
  )
) %>%
  layout(
    title = "Number of incidents by States",
    showlegend = TRUE
  )
# Clean and merge the free-text values of the Race field into a small set of
# categories: Black, White, Asian, Native American and Other.
MS_dataset[
  Race %in% c(
    "Black American or African American",
    "black",
    "Black American or African American/Unknown"
  ),
  Race := "Black"
]
MS_dataset[
  Race %in% c(
    "White American or European American",
    "White American or European American/Some other Race",
    "white"
  ),
  Race := "White"
]
MS_dataset[
  Race %in% c(
    "Asian American",
    "Asian American/Some other race"
  ),
  Race := "Asian"
]
MS_dataset[
  Race == "Native American or Alaska Native",
  Race := "Native American"
]
# Unclear, unknown, mixed and missing values all end up in "Other".
# Missing Race is deliberately NOT imputed as "White": that would invent data
# and inflate the White category in every chart that groups by Race.
MS_dataset[
  is.na(Race) | Race %in% c(
    "unclear",
    "Unknown",
    "Two or more races",
    "Some other race"
  ),
  Race := "Other"
]
# Stacked bar chart: total victims per Year, one coloured segment per Race.
# The data is aggregated first so each (Race, Year) pair yields one bar.
plot_ly(
  data = MS_dataset[
    ,
    .(`Total victims` = sum(`Total victims`)),
    by = .(Race, Year)
  ],
  # `mode` is a scatter-trace attribute and is not valid for bar traces, so
  # it is intentionally not passed here.
  type = "bar",
  x = ~Year,
  y = ~`Total victims`,
  color = ~Race,
  alpha = 0.9
) %>%
  layout(
    title = "Total victims by Race",
    showlegend = TRUE,
    barmode = "stack",
    xaxis = list(title = ""),
    yaxis = list(title = ""),
    # Anchor the legend at the top-left corner of the plotting area.
    legend = list(x = 0, y = 1),
    # "x" shows the hover labels of all segments sharing the hovered year;
    # "compare" is not one of the values hovermode accepts.
    hovermode = "x"
  )
# Polygon outlines of the US states, used as the base layer of the map.
all_states <- map_data("state")
# Build the map in a single ggplot chain: state outlines first, then one
# point per incident. Point size encodes total victims and colour encodes
# fatalities; incidents with Longitude below -140 are left off the map.
p <- ggplot() +
  geom_polygon(
    data = all_states,
    aes(x = long, y = lat, group = group),
    colour = "black",
    fill = "white"
  ) +
  geom_point(
    data = MS_dataset[Longitude >= -140, ],
    aes(
      x = Longitude,
      y = Latitude,
      size = `Total victims`,
      color = Fatalities
    ),
    alpha = 0.6
  ) +
  scale_color_gradient(low = "red", high = "black") +
  ggtitle("Total victims & Fatalities on US map")
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)